# import libraries
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
import scipy.stats as stats
%matplotlib inline
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
import requests
import urllib
import json
#function for the quick analysis of dataset
def analyze (dataset):
dataset.columns = [x.lower().replace(' ', '_') for x in dataset.columns.values]
dataset.info()
display('*'*50)
display(dataset.describe())
display('*'*50)
display(dataset.head())
display('*'*50)
display('Number of duplicate values', dataset.duplicated().sum())
df = pd.read_csv('rest_data.csv')
analyze(df)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15366 entries, 0 to 15365 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 15366 non-null int64 1 object_name 15366 non-null object 2 chain 15366 non-null object 3 object_type 15366 non-null object 4 address 15366 non-null object 5 number 15366 non-null int64 dtypes: int64(2), object(4) memory usage: 720.4+ KB
'**************************************************'
| id | number | |
|---|---|---|
| count | 15366.000000 | 15366.000000 |
| mean | 119720.066901 | 59.547182 |
| std | 73036.130732 | 74.736833 |
| min | 838.000000 | 0.000000 |
| 25% | 28524.000000 | 12.000000 |
| 50% | 144974.500000 | 40.000000 |
| 75% | 184262.250000 | 80.000000 |
| max | 223439.000000 | 1700.000000 |
'**************************************************'
| id | object_name | chain | object_type | address | number | |
|---|---|---|---|---|---|---|
| 0 | 151635 | СМЕТАНА | нет | кафе | город Москва, улица Егора Абакумова, дом 9 | 48 |
| 1 | 77874 | Родник | нет | кафе | город Москва, улица Талалихина, дом 2/1, корпус 1 | 35 |
| 2 | 24309 | Кафе «Академия» | нет | кафе | город Москва, Абельмановская улица, дом 6 | 95 |
| 3 | 21894 | ПИЦЦЕТОРИЯ | да | кафе | город Москва, Абрамцевская улица, дом 1 | 40 |
| 4 | 119365 | Кафе «Вишневая метель» | нет | кафе | город Москва, Абрамцевская улица, дом 9, корпус 1 | 50 |
'**************************************************'
'Number of duplicate values'
0
The function for the preliminary analysis showed 0 duplicates, but we have a column with unique names - "id". Let's exclude this column and check the data again for duplicates.
data_without_id = df.drop(columns = ['id'],axis = 1)
display('Number of duplicates', data_without_id.duplicated().sum())
'Number of duplicates'
82
The number of duplicates takes less than 1% from the total number of records, we can delete duplicates.
key = ['object_name', 'chain', 'object_type', 'address', 'number']
data = df.drop_duplicates(subset=key)
display(df.shape)
display(data.shape)
(15366, 6)
(15284, 6)
The new dataframe is shorter for 82 records.
data['chain'] = data['chain'].str.strip()
data['chain'] = data['chain'].replace('нет', 'non-chain')
data['chain'] = data['chain'].replace('да', 'chain')
The minimum value in the column "Number" (number of seats) is 0. Let's look at these values in more detail:
data_0 = data.query('number == 0')
data_0
| id | object_name | chain | object_type | address | number | |
|---|---|---|---|---|---|---|
| 137 | 115498 | Пекарня & Донер | non-chain | закусочная | город Москва, Бутырская улица, дом 9, корпус 2 | 0 |
| 154 | 152556 | Шаурма в пите | non-chain | предприятие быстрого обслуживания | город Москва, Варшавское шоссе, дом 72, корпус 2 | 0 |
| 157 | 149568 | Суши | non-chain | предприятие быстрого обслуживания | город Москва, улица Маршала Василевского, дом ... | 0 |
| 190 | 126956 | Шаурма, куры-гриль | non-chain | предприятие быстрого обслуживания | город Москва, 1-я Владимирская улица, дом 21 | 0 |
| 195 | 149531 | Шаурма, куры-гриль | non-chain | предприятие быстрого обслуживания | город Москва, 2-я Владимирская улица, дом 44/10 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 15334 | 211228 | МИЛТИ | chain | магазин (отдел кулинарии) | город Москва, Дубравная улица, дом 34/29 | 0 |
| 15335 | 211221 | МИЛТИ | chain | магазин (отдел кулинарии) | город Москва, Пятницкое шоссе, дом 3 | 0 |
| 15336 | 211209 | Готовые блюда Милти | chain | предприятие быстрого обслуживания | город Москва, Ореховый бульвар, дом 22А | 0 |
| 15337 | 208477 | Милти | chain | предприятие быстрого обслуживания | город Москва, 3-й Крутицкий переулок, дом 18 | 0 |
| 15338 | 211201 | Милти | chain | магазин (отдел кулинарии) | город Москва, Новослободская улица, дом 4 | 0 |
1589 rows × 6 columns
The lack of seats in cooking or a fast-food company "Шаурма, куры-гриль" looks reasonable - usually all of those are "to-go" format. Let's see what types of objects were affected by the lack of seats.
data_0.groupby('object_type').id.agg(['count']).sort_values('count', ascending=False)
| count | |
|---|---|
| object_type | |
| предприятие быстрого обслуживания | 690 |
| кафе | 306 |
| закусочная | 182 |
| магазин (отдел кулинарии) | 145 |
| кафетерий | 111 |
| буфет | 64 |
| ресторан | 44 |
| бар | 28 |
| столовая | 19 |
Basically, the lack of seats affected quick service points, culinary departments in stores, snack bars, buffets, etc. Nevertheless, there are restaurants, bars and cafes without seats. Since there are quite a lot of such establishments in the general array, we assume that these establishments are currently working on delivery and pickup.
It can also be seen that in the "object_name" column, not all chain establishments are listed monotonously - for example, there are names "Готовые блюда Милти" and "МИЛТИ", "Милти", which is the name of one network. We are unifying the names for the largest chains.
data_chain = data.query('chain == "chain"')
len(data_chain['object_name'].unique())
724
There are 724 unique names for the chain restaurants in the data.
#look at the most common duplicates
data_chain['object_name'].value_counts().head(50)
KFC 155 Шоколадница 141 Бургер Кинг 122 Макдоналдс 121 Домино'с Пицца 86 Теремок 84 Крошка Картошка 82 Милти 65 Суши Wok 61 Папа Джонс 50 Кофе с собой 42 Чайхона №1 42 Хинкальная 35 Якитория 33 Додо Пицца 31 Starbucks 30 Кофе Хаус 30 Subway 29 МАКДОНАЛДС 29 Тануки 26 Хлеб насущный 24 Мята Lounge 23 Иль Патио 23 Cofix 22 Прайм стар 22 Старбакс 21 Пицца Хат 17 Сабвей 17 ШОКОЛАДНИЦА 16 Пицца Паоло 15 СушиШоп 15 Кулинарная лавка братьев Караваевых 15 Му-Му 15 Алло Пицца 14 Кафе «Шоколадница» 14 Кофемания 13 Баскин Роббинс 12 Волконский 12 Андерсон 12 Штолле 12 Кружка 11 СтардогS 11 Кафе «KFC» 11 БУРГЕР КИНГ 11 Суши Вок 10 Суши Сет 10 Вареничная №1 10 Ваби-Саби 10 ТЕРЕМОК 10 Додо пицца 10 Name: object_name, dtype: int64
#function to unify the names of the largest chains in the data
def chain_name(row):
if 'KFC' in row:
return "KFC"
elif ('Старбакс' in row) | ('Starbucks' in row):
return "Starbucks"
elif 'БУРГЕР КИНГ' in row:
return "Бургер Кинг"
elif ('Шоколадница' in row) | ('ШОКОЛАДНИЦА' in row):
return "Шоколадница"
elif ('Макдоналдс' in row) | ('МАКДОНАЛДС' in row):
return "Макдоналдс"
elif ('ТЕРЕМОК' in row) | ('Теремок' in row):
return "Теремок"
elif ('Милти' in row) | ('МИЛТИ' in row):
return "Милти"
elif ('Сабвей' in row) | ('САБВЕЙ' in row):
return "Subway"
elif 'Додо пицца' in row:
return "Додо Пицца"
elif 'Му-му' in row:
return "МУ-МУ"
elif 'Якитория' in row:
return "Якитория"
else:
return row
data['object_name'] = data['object_name'].apply(chain_name)
#print result
data_chain = data.query('chain == "chain"')
data_chain['object_name'].value_counts().head(50)
KFC 188 Шоколадница 185 Макдоналдс 172 Бургер Кинг 136 Теремок 110 Домино'с Пицца 86 Крошка Картошка 82 Милти 81 Starbucks 69 Суши Wok 61 Папа Джонс 50 Subway 50 Кофе с собой 42 Чайхона №1 42 Додо Пицца 41 Якитория 41 Хинкальная 35 Кофе Хаус 30 Тануки 26 Хлеб насущный 24 Мята Lounge 23 Иль Патио 23 Прайм стар 22 Cofix 22 Пицца Хат 17 Пицца Паоло 15 Кулинарная лавка братьев Караваевых 15 Му-Му 15 СушиШоп 15 Алло Пицца 14 Кофемания 13 Волконский 12 Андерсон 12 Баскин Роббинс 12 Штолле 12 МУ-МУ 11 СтардогS 11 Кружка 11 Ваби-Саби 10 Суши Вок 10 Вареничная №1 10 Суши Сет 10 Братья Караваевы 9 Французская выпечка 9 То Да Сё 9 Иль Патио Планета Суши 9 Джон Джоли 8 Воккер 8 Грабли 8 Прайм-кафе 7 Name: object_name, dtype: int64
len(data_chain['object_name'].unique())
645
The number of unique names for chain establishments has decreased from 724 to 645. We unified the names for the largest chains - at the previous step, the number of restaurants at the leader - KFC - was 155, now it is 188.
For further analysis, we will select the street name from the address in a separate column.
data['address'] = data['address'].str.replace('город Москва, ', '')
split_address = data['address'].str.split(', ', expand=True)
data['street'] = split_address[[0]]
data['street'].value_counts()
город Зеленоград 232
проспект Мира 204
Профсоюзная улица 182
Ленинградский проспект 172
Пресненская набережная 167
...
улица Немчинова 1
улица Лапина 1
Есенинский бульвар 1
улица Брусилова 1
3-й Лихачёвский переулок 1
Name: street, Length: 1854, dtype: int64
fig = px.bar(data, x='object_type')
fig.update_traces(marker_color='rgb(158,202,225)',
marker_line_color='rgb(0,110,127)',
)
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Quantity of public catering facilities by types',
yaxis=dict(
title='Number of restaurants',
titlefont_size=16,
tickfont_size=14,
))
fig.show()
Cafes are the most popular type of catering. Apparently, this type of establishments is focused on representatives of the middle class and on everyday meals. In second place in terms of the number of points with a significant lag is the dining room (economy class, not so marginally), followed by a restaurant with a slight lag - an institution for business class visitors or for special occasions.
fig = px.bar(data, x='chain')
fig.update_traces(marker_line_color='rgb(0,110,127)',
)
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Quantity of non-chain restaurants and chains restaurants',
yaxis=dict(
title='Quantity',
titlefont_size=16,
tickfont_size=14,
),
legend=dict(
x=0,
y=1.0,
bgcolor='rgba(255, 255, 255, 0)',
bordercolor='rgba(255, 255, 255, 0)'
),
height=500,
width=600
)
fig.show()
There are about four times more non-chain establishments in Moscow than chain ones.
data_1 = data.groupby(['object_type', 'chain']).count().reset_index(level=[0,1])
data_1
| object_type | chain | id | object_name | address | number | street | |
|---|---|---|---|---|---|---|---|
| 0 | бар | chain | 37 | 37 | 37 | 37 | 37 |
| 1 | бар | non-chain | 818 | 818 | 818 | 818 | 818 |
| 2 | буфет | chain | 11 | 11 | 11 | 11 | 11 |
| 3 | буфет | non-chain | 566 | 566 | 566 | 566 | 566 |
| 4 | закусочная | chain | 56 | 56 | 56 | 56 | 56 |
| 5 | закусочная | non-chain | 293 | 293 | 293 | 293 | 293 |
| 6 | кафе | chain | 1396 | 1396 | 1396 | 1396 | 1396 |
| 7 | кафе | non-chain | 4675 | 4675 | 4675 | 4675 | 4675 |
| 8 | кафетерий | chain | 52 | 52 | 52 | 52 | 52 |
| 9 | кафетерий | non-chain | 343 | 343 | 343 | 343 | 343 |
| 10 | магазин (отдел кулинарии) | chain | 78 | 78 | 78 | 78 | 78 |
| 11 | магазин (отдел кулинарии) | non-chain | 195 | 195 | 195 | 195 | 195 |
| 12 | предприятие быстрого обслуживания | chain | 788 | 788 | 788 | 788 | 788 |
| 13 | предприятие быстрого обслуживания | non-chain | 1110 | 1110 | 1110 | 1110 | 1110 |
| 14 | ресторан | chain | 543 | 543 | 543 | 543 | 543 |
| 15 | ресторан | non-chain | 1739 | 1739 | 1739 | 1739 | 1739 |
| 16 | столовая | chain | 3 | 3 | 3 | 3 | 3 |
| 17 | столовая | non-chain | 2581 | 2581 | 2581 | 2581 | 2581 |
data_1 = data.groupby(['object_type', 'chain']).count().reset_index(level=[0,1])
fig = px.bar(data_1, x="object_type", y="id", color="chain", barmode='stack')
fig.update_layout(xaxis={'categoryorder':'total descending'},
title_text='Chain and non-chain restaurants by quantity and type',
yaxis=dict(
title='Quantity',
titlefont_size=16,
tickfont_size=14,
)
)
fig.show()
Cafes, restaurants and fast-food businesses are characterized by network distribution. What is the distribution strategy chosen by chain restaurants: are there many places with a small number of seats in each or are there few points with a large number of seats?
data_chain = data.query('chain == "chain"')
data_chain = data_chain.groupby('object_name').number.agg(['count','mean']).sort_values('count', ascending=False).reset_index(level=[0,0])
#exclude the chains in which there is 1 point, since technically they are not yet chains
data_chain = data_chain.query('count != 1')
data_chain.rename(columns = {'count' : 'number', 'mean' : 'seats_average'}, inplace = True)
data_chain
| object_name | number | seats_average | |
|---|---|---|---|
| 0 | KFC | 188 | 55.835106 |
| 1 | Шоколадница | 185 | 57.648649 |
| 2 | Макдоналдс | 172 | 87.523256 |
| 3 | Бургер Кинг | 136 | 47.235294 |
| 4 | Теремок | 110 | 27.254545 |
| ... | ... | ... | ... |
| 237 | ИЛЬ ПАТИО | 2 | 56.500000 |
| 238 | Бакинский бульвар | 2 | 213.000000 |
| 239 | ГРАБЛИ | 2 | 85.000000 |
| 240 | Изба | 2 | 17.500000 |
| 241 | Вкусняшка | 2 | 25.000000 |
242 rows × 3 columns
ax = sns.jointplot(data=data_chain, x="seats_average", y="number")
ax.ax_joint.set_xlabel('Average number of seats', fontsize=14)
ax.ax_joint.set_ylabel('Quantity of points in the chain', fontsize=14)
ax.ax_joint.axvline(x=data_chain.seats_average.median())
ax.ax_joint.axhline(y=data_chain.seats_average.mean())
plt.title('The ratio of the number of seats and the number of establishments in the chain', y=1.3, fontsize = 16)
sns.despine()
The main part of the establishments is located in the range with the number of seats up to 100. Despite the economies of scale when building a logistics chain or hiring staff with a large number of seats, small outlets require less investment to open and less rent, in addition, previously it was clear from the data that some establishments work "to take away".
data_fig = data.groupby('object_type').number.agg(['mean']).sort_values('mean', ascending=False).reset_index()
fig = px.bar(data_fig, x='object_type', y='mean')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Average number of seats in establishments',
yaxis=dict(
title='Number of seats',
titlefont_size=16,
tickfont_size=14,
))
fig.show()
On average, the largest number of seats are provided by canteens and restaurants. Canteens are focused on fast service and a standardized approach, while restaurants are focused on large banquets.
#removing of "Зеленоград" and "Сосенское" settlement from the analysis as those are suburbs of Moscow
data_streets = data.query('street != "город Зеленоград" & street != "поселение Сосенское"')
data_streets = data_streets.groupby('street').count().sort_values(by='id', ascending=False).head(10)
data_streets = data_streets.reset_index()
data_streets
| street | id | object_name | chain | object_type | address | number | |
|---|---|---|---|---|---|---|---|
| 0 | проспект Мира | 204 | 204 | 204 | 204 | 204 | 204 |
| 1 | Профсоюзная улица | 182 | 182 | 182 | 182 | 182 | 182 |
| 2 | Ленинградский проспект | 172 | 172 | 172 | 172 | 172 | 172 |
| 3 | Пресненская набережная | 167 | 167 | 167 | 167 | 167 | 167 |
| 4 | Варшавское шоссе | 162 | 162 | 162 | 162 | 162 | 162 |
| 5 | Ленинский проспект | 148 | 148 | 148 | 148 | 148 | 148 |
| 6 | проспект Вернадского | 128 | 128 | 128 | 128 | 128 | 128 |
| 7 | Кутузовский проспект | 114 | 114 | 114 | 114 | 114 | 114 |
| 8 | Каширское шоссе | 111 | 111 | 111 | 111 | 111 | 111 |
| 9 | Кировоградская улица | 108 | 108 | 108 | 108 | 108 | 108 |
fig = px.bar(data_streets, x='street', y='id')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Top 10 streets by the number of public catering facilities',
yaxis=dict(
title='Number of establishements',
titlefont_size=16,
tickfont_size=14,
))
fig.show()
#save the list of the most popular streets in a separate variable
street_list = ['проспект Мира','Профсоюзная улица','Ленинградский проспект','Пресненская набережная','Варшавское шоссе','Ленинский проспект','проспект Вернадского','Кутузовский проспект','Каширское шоссе','Кировоградская улица']
#upload external data on districts and street names in Moscow
df_moscow=pd.read_csv('mosgaz-streets.csv')
df_moscow.sample(10)
| streetname | areaid | okrug | area | |
|---|---|---|---|---|
| 977 | Буракова улица | 36 | ВАО | Район Соколиная Гора |
| 3088 | Коллективный проезд | 107 | ЮАО | Район Царицыно |
| 2044 | Коптевская улица | 62 | САО | Район Коптево |
| 600 | 1-й Николощеповский переулок | 12 | ЦАО | Район Арбат |
| 454 | Театральный проезд | 19 | ЦАО | Тверской район |
| 33 | Большой Кондратьевский переулок | 17 | ЦАО | Пресненский район |
| 1571 | 1-я Карпатская улица | 46 | ЗАО | Район Солнцево |
| 1100 | МКАД, 107-й километр | 37 | ВАО | Район Ивановское |
| 3547 | Упорный переулок | 114 | ЮВАО | Район Лефортово |
| 1358 | улица Леси Украинки | 40 | ЗАО | Район Кунцево |
df_most_popular=df_moscow.query('streetname in @street_list')
#filter needed data
df_most_popular = df_most_popular.groupby('okrug')['streetname'].count()
df_most_popular
okrug ЗАО 7 САО 4 СВАО 6 ЦАО 3 ЮАО 14 ЮЗАО 14 Name: streetname, dtype: int64
The most popular streets for catering facilities are located in the South-Western Administrative District, Southern and Western.
data_oneobject = data.groupby('street').count().reset_index()
data_oneobject = data_oneobject.query('id == 1')
display('Number of streets with one catering facility:', len(data_oneobject))
'Number of streets with one catering facility:'
541
#create a variable with a list of unpopular streets
nonpopular_streets = data_oneobject['street']
df_least_popular=df_moscow.query('streetname in @nonpopular_streets')
#filtering the necessary data
df_least_popular = df_least_popular.groupby('okrug')['streetname'].count()
df_least_popular
okrug ВАО 72 ЗАО 43 Зеленоград 2 САО 55 СВАО 67 СЗАО 27 ЦАО 185 ЮАО 30 ЮВАО 55 ЮЗАО 28 Name: streetname, dtype: int64
There are most streets with 1 catering facility in the Central Administrative District, East and North-East.
data_fig = data.query('street in @street_list')
sns.set(rc={'figure.figsize':(16, 9)})
ax = sns.boxplot(x="street", y="number", data=data_fig, orient='v')
# add stripplot
ax = sns.stripplot(x="street", y="number", data=data_fig, color="orange", jitter=0.2, size=2.5)
# Calculate number of obs per group & median to position labels
medians = data_fig.groupby(['street'])['number'].median().values
nobs = data_fig.groupby("street").size().values
nobs = [str(x) for x in nobs.tolist()]
nobs = ["n: " + i for i in nobs]
# Add it to the plot
pos = range(len(nobs))
for tick,label in zip(pos,ax.get_xticklabels()):
plt.text(pos[tick], medians[tick] + 0.4, nobs[tick], horizontalalignment='center', size='medium', color='w', weight='semibold')
plt.xticks(rotation=45)
ax.set_ylim(-10,300)
plt.title('The number of seats on the most popular streets', fontsize=16)
plt.ylabel('The number of seats', fontsize=14)
plt.show()
data_fig = data_fig.groupby('street').number.agg(['mean']).sort_values('mean', ascending=False).reset_index()
fig = px.bar(data_fig, x='street', y='mean')
fig.update_layout(barmode='stack',
xaxis={'categoryorder':'total descending'},
title_text='Average number of seats in establishements',
yaxis=dict(
title='Number of seats',
titlefont_size=16,
tickfont_size=14,
))
fig.show()
The maximum number of seats from all restaurants in the dataset is 1,700, average (40 seats) significantly different from the median - 40, the third quartile is 80 places (see the section "Loading data and preparing them for analysis"). The average number of seats on the streets with the most public catering facilities is from 45 to 85.
The cafe is a popular format of catering establishments among Moscow population. Unlike bars and canteens, this type of establishment is characterized by network distribution.
We recommend not to break the trends in the market, and open establishments with no more than 50 seats - apparently, this strategy is the most promising, as it has received the most widespread. At the same time, it makes sense to open establishments with 50-80 seats on popular and busy streets (Kutuzovsky Prospekt, Vernadsky Avenue, Leninsky Prospekt, Mira Avenue, etc.).
There are still many streets in the city with 1 catering facility. It is necessary to carefully study the North-Eastern Administrative District as a promising one for opening new points.
It is necessary, if possible, to supplement the market analysis with data on the passability of establishments, the period of existence and the average check.